1. Load and Clean 2020 Census Data
# (1) Load demographic data
credential <- Sys.getenv("census_api_key")
lookup_var <- load_variables(2020, "acs5", cache = TRUE)
glimpse(lookup_var)
## Rows: 27,850
## Columns: 3
## $ name <chr> "B01001_001", "B01001_002", "B01001_003", "B01001_004", "B0100…
## $ label <chr> "Estimate!!Total:", "Estimate!!Total:!!Male:", "Estimate!!Tota…
## $ concept <chr> "SEX BY AGE", "SEX BY AGE", "SEX BY AGE", "SEX BY AGE", "SEX B…
race <- c(white = "B02001_002", #race
black = "B02001_003",
indian_alaska = "B02001_004",
asian = "B02001_005",
pacific = "B02001_006")
education_attainment <- c(no_schooling = "B15003_002", #education_attainment
elementary_school = "B15003_009",
middle_school = "B15003_012",
high_school = "B15003_017",
bachelor = "B15003_022")
employment_status <- c(employed = "B23025_004", # employment_status
unemployed = "B23025_005",
no_in_labor_force = "23025_007")
sex_by_age <- c(total_by_age ="B01001_001", #sex_by_age
male_by_age_total = "B01001_002",
female_by_age_total = "B01001_026")
demogdata <- get_acs(
geography = "tract",
year = 2020,
variables = c(race,
education_attainment,
employment_status,
sex_by_age,
hhincome = "B19013_001", # median household income
poverty = "B17020_002"), # poverty level
key = credential,
state = 36,
county = c(005, 047, 061, 081, 085))
# (2) Tidy Demographic Data
demo_clean <- demogdata %>%
select(-moe) %>%
#Writes all column names in lower_case
rename_all(~str_to_lower(.)) %>%
#Replaces the white space in column names with underscores
rename_all(~str_replace_all(., " ", "_")) %>%
separate(name, c("census_tract","county", "state"), sep = ", ") %>%
select(-state) %>%
pivot_wider(names_from = variable,
values_from = estimate)
# (3) Missing Data Imputation
# Decided to just impute with the mean-replacing NAs with income average per county
mean_impute <- demo_clean %>%
select(geoid,county, hhincome) %>%
group_by(county) %>%
filter(!is.na(hhincome)) %>%
summarise(hhincome = mean(hhincome))
mean_impute
## # A tibble: 5 × 2
## county hhincome
## <chr> <dbl>
## 1 Bronx County 47615.
## 2 Kings County 71143.
## 3 New York County 104136.
## 4 Queens County 77123.
## 5 Richmond County 85720.
demo_clean <- demo_clean %>%
left_join(mean_impute, by = c("county")) %>%
mutate(hhincome = ifelse(is.na(hhincome.x), hhincome.y, hhincome.x)) %>%
select(-hhincome.y, -hhincome.x)
view(demo_clean)
# (4) Mutate more variables
# Calculate the percentage of minorities
demo_clean$rate_of_minorities <- round((demo_clean$black + demo_clean$asian + demo_clean$indian_alaska + demo_clean$pacific)/(demo_clean$white + demo_clean$black + demo_clean$asian + demo_clean$indian_alaska + demo_clean$pacific) * 100, 1) %>%
replace(., is.na(.), 0)
# Calculate the employment rate
demo_clean$employment_rate <- round((demo_clean$employed)/(demo_clean$employed + demo_clean$unemployed) * 100, 1) %>%
replace(., is.nan(.), 0)
# (5) Add Geometry and Convert to sf Object
geo <- tracts(
state = 36,
county = c(005, 047, 061, 081, 085),
cb = TRUE
) %>%
st_transform(crs = 4326) %>%
select(GEOID, geometry)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
demography_sf <- left_join(
x = demo_clean,
y = geo,
by = c("geoid" = "GEOID")) %>%
st_as_sf() %>%
st_transform(crs = 4326) %>%
mutate(geoid = as.numeric(geoid))
3. Set Up Models for the Floodplain Data
# joins floodplain with census tract map and codes tracts by whether they are in floodplain
sf_use_s2(FALSE)
floodtracts <- st_join(geo,floodplain500, join = st_intersects, left = FALSE)
flood_demography_sf <- demography_sf %>%
mutate(flood = ifelse(geoid %in% floodtracts$GEOID, 1, 0)) %>%
na.omit() %>%
mutate(flood = factor(flood))
# splits data into training and testing groups (keep both sf and df objects because we need sf objects to plot both training data in EDA and testing data in assessment part, and we need data frame objects to model which do not deal with geometry information)
set.seed(20220422)
flood_split <- initial_split(flood_demography_sf, prop = 0.7, strata = "flood")
flood_train_sf <- training(flood_split)
flood_test_sf <- testing(flood_split)
flood_train_df <- flood_train_sf %>%
st_centroid() %>%
mutate(lat = unlist(map(geometry, 1)), lon = unlist(map(geometry, 2))) %>%
na.omit() %>%
as_tibble() %>%
select(-c(geometry, geoid, census_tract, county))
flood_test_df <- flood_test_sf %>%
st_centroid() %>%
mutate(lat = unlist(map(geometry, 1)), lon = unlist(map(geometry, 2))) %>%
na.omit() %>%
as_tibble() %>%
select(-geometry, geoid, census_tract, county)
#Visualize the relationship between demographic data and flooding
#Descriptive Analysis
flood_train_sf %>%
summary()
## geoid census_tract county total_by_age
## Min. :3.601e+10 Length:1628 Length:1628 Min. : 0
## 1st Qu.:3.605e+10 Class :character Class :character 1st Qu.: 2216
## Median :3.606e+10 Mode :character Mode :character Median : 3402
## Mean :3.606e+10 Mean : 3598
## 3rd Qu.:3.608e+10 3rd Qu.: 4669
## Max. :3.609e+10 Max. :16600
## male_by_age_total female_by_age_total white black
## Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.:1069 1st Qu.:1135 1st Qu.: 408 1st Qu.: 53.0
## Median :1603 Median :1754 Median : 1124 Median : 264.5
## Mean :1719 Mean :1879 Mean : 1487 Mean : 855.9
## 3rd Qu.:2250 3rd Qu.:2454 3rd Qu.: 2103 3rd Qu.:1426.0
## Max. :7280 Max. :9320 Max. :11625 Max. :7248.0
## indian_alaska asian pacific no_schooling
## Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 61.0 1st Qu.: 0.00 1st Qu.: 17.0
## Median : 0.00 Median : 233.5 Median : 0.00 Median : 50.5
## Mean : 14.88 Mean : 519.0 Mean : 2.32 Mean : 77.2
## 3rd Qu.: 11.00 3rd Qu.: 679.8 3rd Qu.: 0.00 3rd Qu.:109.0
## Max. :971.00 Max. :5687.0 Max. :405.00 Max. :785.0
## elementary_school middle_school high_school bachelor
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 259.5 1st Qu.: 246.0
## Median : 4.00 Median : 22.00 Median : 449.5 Median : 428.5
## Mean : 18.67 Mean : 41.48 Mean : 505.3 Mean : 574.9
## 3rd Qu.: 25.00 3rd Qu.: 56.00 3rd Qu.: 696.0 3rd Qu.: 752.2
## Max. :417.00 Max. :798.00 Max. :2685.0 Max. :5903.0
## poverty employed unemployed hhincome
## Min. : 0.0 Min. : 0 Min. : 0.0 Min. : 2499
## 1st Qu.: 181.8 1st Qu.:1020 1st Qu.: 45.0 1st Qu.: 50856
## Median : 401.5 Median :1592 Median : 92.0 Median : 70978
## Mean : 617.1 Mean :1731 Mean : 122.5 Mean : 74456
## 3rd Qu.: 816.5 3rd Qu.:2222 3rd Qu.: 168.0 3rd Qu.: 90242
## Max. :4350.0 Max. :9758 Max. :1129.0 Max. :250001
## rate_of_minorities employment_rate geometry flood
## Min. : 0.00 Min. : 0.00 MULTIPOLYGON :1628 0:1054
## 1st Qu.: 20.80 1st Qu.: 90.80 epsg:4326 : 0 1: 574
## Median : 46.60 Median : 94.10 +proj=long...: 0
## Mean : 48.52 Mean : 89.68
## 3rd Qu.: 77.55 3rd Qu.: 96.40
## Max. :100.00 Max. :100.00
# Relationship Between Rate of Employment, Rate of Minorities and Poverty
flood_train_sf %>%
# select(c(-geometry, -Lat, -Lon)) %>%
ggplot(aes(x=employment_rate, y=rate_of_minorities)) +
geom_point(aes(color = poverty), alpha = 0.4) +
labs(title = "Relationship Between Rate of Employment, Concentration of Minorities and Poverty \nBased on All Counties in New York City",
subtitle = "No clear relationship found among the three variables",
caption = "Data source: 2019 1-year ACS, US Census Bureau",
x = "Employment Rate (%)",
y = "Rate of Minorities (%)",
color = "Poverty"
)

theme_minimal()
#Flooding Data on Map
#Graph to show median household income in floodplain by county
flood_train_sf_plot <- flood_train_sf %>%
group_by(geoid, county, census_tract) %>%
mutate(tooltip = paste(county, census_tract, hhincome, sep = ": "))
plot3 <- ggplot() +
geom_sf_interactive(data = flood_train_sf_plot, color = "white", aes(fill = hhincome,
tooltip = tooltip, data_id = hhincome), size = 0.2) +
geom_sf(data = filter(flood_train_sf, flood == 1), fill = "red", alpha = 0.5) +
scale_fill_distiller(palette = "Blues", direction = 1, labels = label_dollar()) +
labs(title = "Median Household Income in New York City",
subtitle = "By Counties in New York City",
caption = "Data source: 2019 1-year ACS, US Census Bureau",
fill = "Household Income") +
theme_void()
girafe(ggobj = plot3) %>%
girafe_options(opts_hover(css = "fill:orange;"),
opts_zoom(max = 10))
#Graph to show rate of minorities in floodplain by county
flood_train_sf_plot <- flood_train_sf %>%
group_by(geoid, county, census_tract) %>%
mutate(tooltip = paste(county, census_tract, rate_of_minorities, sep = ": "))
P4 <- ggplot() +
geom_sf_interactive(
data = flood_train_sf_plot,
color = "white",
aes(fill = rate_of_minorities, tooltip = tooltip, data_id = rate_of_minorities),
size = 0.5) +
geom_sf(
data = filter(flood_train_sf, flood == 1),
fill = "red",
color = "red",
alpha = 0.2,
show.legend = "point"
) +
scale_fill_viridis_c() +
# scale_fill_distiller(palette = "Blues", direction = 1) +
labs(
title = "Concentration of Minorities in New York City",
subtitle = "Flooding Impact by Rate of Minorities",
caption = "Data source: 2019 5-year ACS, US Census Bureau",
fill = "Concentration of Minorities") +
theme_void()
girafe(ggobj = P4) %>%
girafe_options(opts_hover(css = "fill:orange;"),
opts_zoom(max = 10))
#Graph to show employment rate income in floodplain by county
flood_train_sf_plot <- flood_train_sf %>%
group_by(geoid, county, census_tract) %>%
mutate(tooltip = paste(county, census_tract, employment_rate, sep = ": "))
P5 <- ggplot() +
geom_sf_interactive(
data = flood_train_sf_plot,
color = "white",
aes(fill = employment_rate, tooltip = tooltip, data_id = employment_rate), size = 0.2) +
geom_sf(
data = filter(flood_train_sf, flood == 1),
fill = "red", color = "red", alpha = 0.5) +
scale_fill_distiller(palette = "Blues", direction = 1) +
labs(
title = "Rate of Employment in New York City",
subtitle = "Flooding Impact by Rate of Employment",
caption = "Data source: 2019 5-year ACS, US Census Bureau",
fill = "Rate of Employment") +
theme_void()
girafe(ggobj = P5) %>%
girafe_options(opts_hover(css = "fill:orange;"),
opts_zoom(max = 10))
#Graph to show poverty rate income in floodplain by county
flood_train_sf_plot <- flood_train_sf %>%
group_by(geoid, county, census_tract) %>%
mutate(tooltip = paste(county, census_tract, poverty, sep = ": "))
P6 <- ggplot() +
geom_sf_interactive(
data = flood_train_sf_plot,
color = "white",
aes(fill = poverty,tooltip = tooltip, data_id = poverty), size = 0.2) +
geom_sf(
data = filter(flood_train_sf, flood == 1),
fill = "red",
color = "red",
alpha = 0.5) +
labs(
title = " Poverty in New York City",
subtitle = "Flooding Impact by Incidences of Poverty",
caption = "Data source: 2019 5-year ACS, US Census Bureau",
fill = "Poverty Levels") +
theme_void()
girafe(ggobj = P6) %>%
girafe_options(opts_hover(css = "fill:orange;"),
opts_zoom(max = 10))
flood_rec <-
recipe(flood ~ ., data = flood_train_df) %>%
# center and scale all predictors
step_center(all_predictors()) %>%
step_scale(all_predictors())
flood_folds <- vfold_cv(flood_train_df, v = 10, repeats = 1)
# Decision Tree Model
cart_mod <- decision_tree(tree_depth = tune()) %>%
set_engine(engine = "rpart") %>%
set_mode(mode = "classification")
cart_grid <- grid_regular(tree_depth(), levels = 10)
cart_wf <- workflow() %>%
add_recipe(flood_rec) %>%
add_model(cart_mod)
cart_cv <- cart_wf %>%
tune_grid(
resamples = flood_folds,
grid = cart_grid,
metrics = metric_set(roc_auc)
)
# Logistic Model
# run an initial logistic regression to get data for variable importance
floodlogit <- glm(flood ~., data = flood_train_df, family = "binomial")
# displays importance of 10 most important variables
importances <- varImp(floodlogit)
importances %>%
arrange(desc(Overall)) %>%
top_n(10)
## Overall
## employed 6.385857
## employment_rate 5.805587
## bachelor 4.560309
## unemployed 3.768257
## asian 3.305981
## total_by_age 3.136806
## lon 2.961183
## hhincome 1.727637
## lat 1.636568
## poverty 1.284834
# According to https://stackoverflow.com/questions/47822694/logistic-regression-tuning-parameter-grid-in-r-caret-package/48218280#48218280, there is no tunning parameter for glm model.
logistic_mod <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
logistic_wf <- workflow() %>%
add_model(logistic_mod) %>%
add_recipe(flood_rec)
logistic_cv <- logistic_wf %>%
fit_resamples(
resample = flood_folds,
metrics = metric_set(roc_auc)
)
# KNN Model
flood_knn_mod <-
nearest_neighbor(neighbors = tune()) %>%
set_engine(engine = "kknn") %>%
set_mode(mode = "classification")
knn_grid <- grid_regular(neighbors(), levels = 10)
flood_knn_wf <- workflow() %>%
add_recipe(flood_rec) %>%
add_model(flood_knn_mod)
flood_knn_cv <- flood_knn_wf %>%
tune_grid(
resample = flood_folds,
grid = knn_grid,
metrics = metric_set(roc_auc)
)
cart_roc <- cart_cv %>%
collect_metrics(summarize = FALSE) %>%
group_by(id) %>%
summarise(cart_roc_auc = mean(.estimate))
logistic_roc <- logistic_cv %>%
collect_metrics(summarize = FALSE) %>%
group_by(id) %>%
summarise(logistic_roc_auc = mean(.estimate))
knn_roc <- flood_knn_cv %>%
collect_metrics(summarize = FALSE) %>%
group_by(id) %>%
summarise(knn_roc_auc = mean(.estimate))
roc_auc <- bind_cols(
fold = cart_roc$id,
cart = cart_roc$cart_roc_auc,
logistic = logistic_roc$logistic_roc_auc,
knn = knn_roc$knn_roc_auc
) %>%
pivot_longer(
cols = 2:4,
names_to = "model",
values_to = "roc_auc"
)
P7 <- roc_auc %>%
ggplot() +
geom_boxplot(
aes(x = model, y = roc_auc)
) +
labs(
title = "Compare the ROC_AUC across Decision Tree, Logistic, KNN",
subtitle = "Decision Tree has the highest ROC_AUC",
x = "Models"
)
T1 <- roc_auc %>%
group_by(model) %>%
summarise(
roc_auc = mean(roc_auc)
)
P7 + gridExtra::tableGrob(T1)

# CART is the best model
flood_cart_best <- cart_cv %>%
select_best(metric = "roc_auc")
flood_final_wf <- cart_wf %>%
finalize_workflow(
parameters = flood_cart_best
)
flood_final_fit <- flood_final_wf %>%
fit(data = flood_train_df)
rpart.plot::rpart.plot(x = flood_final_fit$fit$fit$fit)

flood_assess <- bind_cols(
flood_test_sf,
predict(object = flood_final_fit, new_data = flood_test_df, type = "class"),
predict(object = flood_final_fit, new_data = flood_test_df, type = "prob")
) %>%
mutate(
assessment = case_when(
flood == 1 & .pred_class == 1 ~ "true positive",
flood == 1 & .pred_class == 0 ~ "false negative",
flood == 0 & .pred_class == 1 ~ "false positive",
flood == 0 & .pred_class == 0 ~ "true negative"
)
)
P_assess1 <- flood_assess %>%
roc_curve(truth = flood, estimate = .pred_0) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_path() +
geom_abline(lty = 3) +
coord_equal() +
theme_minimal()
P_assess2 <- flood_assess %>%
ggplot() +
geom_sf(aes(fill = assessment), color = "white", alpha = 0.7)+
theme_void() +
labs(
title = "Assessment of the Finalized Knn Model",
subtitle = "True postive and true negative take large share of the total",
fill = ""
)
P_assess1 + P_assess2

flood_train_sf %>%
group_by(flood)%>%
summarize(n = sum(total_by_age))
## Simple feature collection with 2 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -74.25609 ymin: 40.4961 xmax: -73.70036 ymax: 40.91771
## Geodetic CRS: WGS 84
## # A tibble: 2 × 3
## flood n geometry
## <fct> <dbl> <MULTIPOLYGON [°]>
## 1 0 3778613 (((-74.16805 40.55272, -74.16793 40.55294, -74.16618 40.55292, …
## 2 1 2078265 (((-74.25237 40.49948, -74.25425 40.50188, -74.25459 40.5023, -…